#!/usr/bin/env Rscript

#### Preamble
         library(cem)

  # |--------------------------+------------+------------+--------------+----------|
  # | Options:                 |            |            |              |          |
  # |--------------------------+------------+------------+--------------+----------|
  # | Impute gender            | IMPUTE     |            |              |          |
  # | Expunge by Cal policy    | FORCECULL  |            |              |          |
  # | Opposite sex only        | OPPSEX     |            |              |          |
  # | Gender specific          | MALE       | FEMALE     |              |          |
  # | Age                      | UNDER65    | OVER65     | 65TO79       | 80OROVER |
  # | Voting                   | SAME       | DIFFERENT  | MORE         | FEWER    |
  # | Voting                   | NOABSTAIN  | ABSTAIN    |              |          |
  # | Voting                   | ONEABSTAIN | TWOABSTAIN | THREEABSTAIN |          |
  # | Occupancy                | OCC2       | OCC3PLUS   |              |          |
  # | Add zip code data        | ZIP        |            |              |          |
  # | Pop density >= / < 1000  | RURAL      | URBAN      |              |          |
  # |--------------------------+------------+------------+--------------+----------|
           
         option <- list(c("MORE"), c("FEWER"), c("TWOABSTAIN", "SAME"), c("THREEABSTAIN", "SAME"))
         option.name <- c("More Votes than Alter - Half and Aggregated - Given 2004", "Fewer Votes than Alter - Half and Aggregated - Given 2004", "Same Voting History - Two Abstains - Half and Aggregated - Given 2004", "Same Voting History - Half and Aggregated - Three Abstains - Given 2004")

### Extract File
   setwd('/home/whobbs/data/cavr')
   
   # find column numbers for AWK command
   header<-1:89
   names(header)<-c("pid_e","lastname_e","zip_e","state_e","localitycode_e","precinct_e","precinctpart_e","regdate_e","dob_e","dod_e","ageatdeath_e","gender_e","party_e","gg10_e","age_gg10_e","dfdgg10_e","gp10_e","age_gp10_e","dfdgp10_e","ss9_e","age_ss9_e","dfdss9_e","pg8_e","age_pg8_e","dfdpg8_e","dp8_e","pp8_e","gg6_e","gp6_e","ss5_e","pg4_e","pp4_e","male_e","female_e","dem_e","rep_e","death_e","hid","hid_occnum","pid_a","lastname_a","zip_a","state_a","localitycode_a","precinct_a","precinctpart_a","regdate_a","dob_a","dod_a","ageatdeath_a","gender_a","party_a","gg10_a","age_gg10_a","dfdgg10_a","gp10_a","age_gp10_a","dfdgp10_a","ss9_a","age_ss9_a","dfdss9_a","pg8_a","age_pg8_a","dfdpg8_a","dp8_a","pp8_a","gg6_a","gp6_a","ss5_a","pg4_a","pp4_a","male_a","female_a","dem_a","rep_a","death_a","ea_agediscrep","pop_density","median_income","avg_income","percapita_income","percent_foreign_naturalized", "hid_e_2010", "mover_e", "hid_a_2010", "mover_a", "registration", "birthplace", "language")
   header<-as.list(header)
   
   my.vars <- with(header, paste(age_gg10_a, age_gg10_e, ea_agediscrep, female_a, female_e, male_a, male_e, hid_occnum, ageatdeath_a, death_a, dfdgg10_a, dfdgp10_a, dfdss9_a, dem_a, dem_e, rep_a, rep_e, gg10_a, pg8_a, gg6_a, gp6_a, ss5_a, pg4_a, gg10_e, gp10_e, ss9_e, pg8_e, pp8_e, gg6_e, gp6_e, ss5_e, pg4_e, sep = ", $"))
   
   if ("ZIP" %in% unlist(option)){ # needed for Rural/Urban analysis
  my.vars <- paste(my.vars, header$pop_density, header$percapita_income, sep = ", $")
  }
  
   my.vars <- gsub("^", "$", my.vars)
   
   cmd<-paste("awk -F, '{print ", my.vars, "}' OFS=, cavr_analysisfile_withregdat.csv | sed -n '1~2p' > temp_analysis_file.csv", sep = "") # every second line
  
   # extract only the necessary columns
   system(cmd, wait = TRUE)

### Read File
  system('head -n10 temp_analysis_file.csv > temp_analysis_file_head.csv', wait = TRUE)
  for.classes <- read.csv('temp_analysis_file_head.csv', head = TRUE, stringsAsFactors = FALSE)
  names(for.classes) <- gsub("_", ".", names(for.classes)) # convert column names to R format
  casecontrol.names <- names(for.classes) # save names
  casecontrol.classes <- lapply(for.classes, class) # save column classes for faster read.csv
  
  casecontrol <- read.csv('temp_analysis_file.csv', head = TRUE, colClasses = casecontrol.classes)
  names(casecontrol) <- casecontrol.names # reassign names
  
  system("rm temp_analysis_file.csv", wait = FALSE)
  system("rm temp_analysis_file_head.csv", wait = FALSE)


#### Prep

### R format
     # remove egos missing from the 2010 voter record, and missing live alters
     
  casecontrol<-casecontrol[complete.cases(casecontrol$gg10.e)&complete.cases(casecontrol$gg10.a),] # gg10.a is 989898 for deceased alters present in 2010 voter record and 999999 for deceased alters absent; gg10.a is NA for non-deceased alters absent
  
     # change 989898 values to NA
  casecontrol[casecontrol$gg10.a==989898,]$gg10.a <- NA
     # change 999999 values to NA
  
  casecontrol[casecontrol$ageatdeath.a==999999,]$ageatdeath.a <- NA
  casecontrol[casecontrol$dfdgg10.a==999999,]$dfdgg10.a <- NA
  casecontrol[casecontrol$dfdgp10.a==999999,]$dfdgp10.a <- NA
  casecontrol[casecontrol$dfdss9.a==999999,]$dfdss9.a <- NA
  casecontrol <- casecontrol[casecontrol$pg4.e == 1 & casecontrol$pg4.a == 1,]

### CEM prep

    agediscrep.cuts <- c(-15.5, -5.5, -1.5, 1.5, 5.5, 15.5)
    age.cuts <- c(17.5,24.5,34.5,44.5,49.5,54.5,59.5,64.5,69.5,74.5,79.5,84.5,89.5,94.5,115.5)
    hid.cuts <- c(.5,1.5,2.5,6.5)
    week.cuts <- c(14.5, 25.5, 51.5, 80.5)
    
    # weeks to Gubernatorial General 2010
    week.begin <- c(-91 - 3/7, -52, -15, -15, 0 + 1/7, 15 + 1/7, 52 + 2/7)
    week.begin <- week.begin * 7
    week.end <- c(-52 - 2/7, -15 - 1/7, 0 - 1/7, 15, 15, 52, 80 + 3/7)
    week.end <- week.end * 7
    week <- 1:length(week.begin)

    setwd('/home/whobbs/scripts/widower_effects_analysis')



  # begin option (i) loop
    for (i in 1:length(option)){

   
  cat("\nPartition:\n")
  print(option.name[[i]])
  print(option[[i]])
  
  my.cutpoints <- list(ea.agediscrep = agediscrep.cuts, age.gg10.a = age.cuts, hid.occnum = hid.cuts)
  
  cat("\n\nCutpoints:\n")
  print(my.cutpoints)


### Set drop vars

    to.drop <- c("death.a","age.gg10.e","dfdgg10.a", "dfdgp10.a", "dfdss9.a", "ageatdeath.a","gg10.e","gp10.e","ss9.e","pg8.e","pp8.e","gg10.a","pg8.a","gg6.a","gp6.a","ss5.a","pg4.a")

    if ("ZIP" %in% unlist(option)){ # pop.density and percapita.income do not vary within zip codes
      to.drop<-c(to.drop, "pop.density", "percapita.income")
    }
    if ("REGDATE" %in% unlist(option) & !("REGDATE" %in% option[[i]])){
      to.drop <- c(to.drop, "regdate.e", "regdate.a")
    }
  
  # display matching terms
    cat("\nMatching on:\n")
    print(names(casecontrol)[!(names(casecontrol) %in% to.drop)])
    cat("\n")


### Output prep
    # write.csv directories and file names
  output.gg10<-paste("output/", option.name[i], " gg10.csv", sep = "")
  output.gp10<-paste("output/", option.name[i], " gp10.csv", sep = "")
  output.ss9<-paste("output/", option.name[i], " ss9.csv", sep = "")
  output.original<-paste("output/", option.name[i], " original.csv", sep = "")
  
  source("summary_scripts/stat_blocks.R")
  
  write.table(rbind(original.sample.header), output.original, col.names = FALSE, row.names = FALSE, sep = ",")
  write.table(rbind(matched.sample.header), output.gg10, col.names = FALSE, row.names = FALSE, sep = ",")
  write.table(rbind(matched.sample.header), output.gp10, col.names = FALSE, row.names = FALSE, sep = ",")
  write.table(rbind(matched.sample.header), output.ss9, col.names = FALSE, row.names = FALSE, sep = ",")

  my.casecontrol <- casecontrol
#### Options
    
### Impute gender

  if ("IMPUTE" %in% option[[i]]){
    if (my.casecontrol$male.e==1&my.casecontrol$male.a==0&my.casecontrol$female.a==0){
      my.casecontrol$female.a<-1
    }
    if (my.casecontrol$female.e==1&my.casecontrol$male.a==0&my.casecontrol$female.a==0){
      my.casecontrol$male.a<-1
    }
    if (my.casecontrol$male.a==1&my.casecontrol$male.e==0&my.casecontrol$female.e==0){
      my.casecontrol$female.e<-1
    }
    if (my.casecontrol$female.a==1&my.casecontrol$male.e==0&my.casecontrol$female.e==0){
      my.casecontrol$male.e<-1
    }
  }

### Opposite sex only

  if ("OPPSEX" %in% option[[i]]){
  my.casecontrol <- my.casecontrol[my.casecontrol$male.e==my.casecontrol$female.a&my.casecontrol$female.a==1|my.casecontrol$female.e==my.casecontrol$male.a&my.casecontrol$male.a==1,]
  }

### Expunge

  if ("FORCECULL" %in% option[[i]]){
    my.casecontrol<-my.casecontrol[my.casecontrol$pg4.e==1&my.casecontrol$pg4.a==1&my.casecontrol$pg8.e==1&my.casecontrol$pg8.a==1,]
  }

### Gender

  if ("MALE" %in% option[[i]]) {
    my.casecontrol<-my.casecontrol[my.casecontrol$male.e==1,]
  }
  if ("FEMALE" %in% option[[i]]) {
    my.casecontrol<-my.casecontrol[my.casecontrol$female.e==1,]
  }

### Age

  if ("UNDER65" %in% option[[i]]) {
    my.casecontrol<-my.casecontrol[my.casecontrol$age.gg10.e<=64,]
  }
  if ("65OROVER" %in% option[[i]]) {
    my.casecontrol<-my.casecontrol[my.casecontrol$age.gg10.e>=65,]
  }
  if ("65TO79" %in% option[[i]]) {
    my.casecontrol<-my.casecontrol[my.casecontrol$age.gg10.e>=65&my.casecontrol$age.gg10.e<=79,]
  }
  if ("80OROVER" %in% option[[i]]) {
    my.casecontrol<-my.casecontrol[my.casecontrol$age.gg10.e>=80,]
  }

### Voting

  if ("SAME" %in% option[[i]]) {
    my.casecontrol<-my.casecontrol[my.casecontrol$gg6.e==my.casecontrol$gg6.a&my.casecontrol$gp6.e==my.casecontrol$gp6.a&my.casecontrol$ss5.e==my.casecontrol$ss5.a&my.casecontrol$pg4.e==my.casecontrol$pg4.a,]
  }
  if ("DIFFERENT" %in% option[[i]]) {
    my.casecontrol<-my.casecontrol[my.casecontrol$gg6.e!=my.casecontrol$gg6.a|my.casecontrol$gp6.e!=my.casecontrol$gp6.a|my.casecontrol$ss5.e!=my.casecontrol$ss5.a|my.casecontrol$pg4.e!=my.casecontrol$pg4.a,]
  }
  if ("MORE" %in% option[[i]]) {
    my.casecontrol<-my.casecontrol[(my.casecontrol$gg6.e+my.casecontrol$gp6.e+my.casecontrol$ss5.e+my.casecontrol$pg4.e)>(my.casecontrol$gg6.a+my.casecontrol$gp6.a+my.casecontrol$ss5.a+my.casecontrol$pg4.a),]
  }
  if ("FEWER" %in% option[[i]]) {
    my.casecontrol<-my.casecontrol[(my.casecontrol$gg6.e+my.casecontrol$gp6.e+my.casecontrol$ss5.e+my.casecontrol$pg4.e)<(my.casecontrol$gg6.a+my.casecontrol$gp6.a+my.casecontrol$ss5.a+my.casecontrol$pg4.a),]
  }
  if ("NOABSTAIN" %in% option[[i]]) {
    my.casecontrol<-my.casecontrol[my.casecontrol$gg6.e==1&my.casecontrol$gp6.e==1&my.casecontrol$ss5.e==1&my.casecontrol$pg4.e==1,]
  }
  if ("ABSTAIN" %in% option[[i]]) {
    my.casecontrol<-my.casecontrol[my.casecontrol$gg6.e==0|my.casecontrol$gp6.e==0|my.casecontrol$ss5.e==0|my.casecontrol$pg4.e==0,]
  }
  if ("ONEABSTAIN" %in% option[[i]]) {
    my.casecontrol<-my.casecontrol[(my.casecontrol$gg6.e+my.casecontrol$gp6.e+my.casecontrol$ss5.e+my.casecontrol$pg4.e)==3,]
  }
  if ("TWOABSTAIN" %in% option[[i]]) {
    my.casecontrol<-my.casecontrol[(my.casecontrol$gg6.e+my.casecontrol$gp6.e+my.casecontrol$ss5.e+my.casecontrol$pg4.e)==2,]
  }
  if ("THREEABSTAIN" %in% option[[i]]) {
    my.casecontrol<-my.casecontrol[(my.casecontrol$gg6.e+my.casecontrol$gp6.e+my.casecontrol$ss5.e+my.casecontrol$pg4.e)==1,]
  }

### Occupancy

  if ("OCC2" %in% option[[i]]) {
    my.casecontrol<-my.casecontrol[my.casecontrol$hid.occnum==2,]
  }
  if ("OCC3PLUS" %in% option[[i]]) {
    my.casecontrol<-my.casecontrol[my.casecontrol$hid.occnum>=3,]
  }

### Rural/Urban

  if ("RURAL" %in% option[[i]]) {
   my.casecontrol<-my.casecontrol[my.casecontrol$pop.density<1000,]
  }
  if ("URBAN" %in% option[[i]]) {
    my.casecontrol<-my.casecontrol[my.casecontrol$pop.density>=1000,]
  }

### Rich/poor

  if ("RICH" %in% option[[i]]) {
   my.casecontrol<-my.casecontrol[my.casecontrol$percapita.income>=35000,]
  }
  if ("POOR" %in% option[[i]]) {
    my.casecontrol<-my.casecontrol[my.casecontrol$percapita.income<35000,]
  }

### Registration

  if ("I" %in% option[[i]]) {
   my.casecontrol<-my.casecontrol[my.casecontrol$registration=="I",]
  }
  if ("M" %in% option[[i]]) {
   my.casecontrol<-my.casecontrol[my.casecontrol$registration=="M",]
  }
  if ("O" %in% option[[i]]) {
   my.casecontrol<-my.casecontrol[my.casecontrol$registration=="O",]
  }
  if ("D" %in% option[[i]]) {
   my.casecontrol<-my.casecontrol[my.casecontrol$registration=="D",]
  }
  if ("R" %in% option[[i]]) {
   my.casecontrol<-my.casecontrol[my.casecontrol$registration=="R",]
  }
  if ("U" %in% option[[i]]) {
   my.casecontrol<-my.casecontrol[my.casecontrol$registration=="U",]
  }  
  
### Birthplace

  if ("CA" %in% option[[i]]) {
   my.casecontrol<-my.casecontrol[my.casecontrol$birthplace=="CA",]
  }

  if ("NOTCA" %in% option[[i]]) {
   my.casecontrol<-my.casecontrol[my.casecontrol$birthplace!="CA",]
  }    

#### Matching & CEM

  # Start week loop, note original summary stats, match, and calculate treatment effects

  # note: week.begin & week.end do not overlap
  
        for (j in 3:length(week.begin)) {
    casecontrol.select <- my.casecontrol[my.casecontrol$dfdgg10.a>=week.begin[j]&my.casecontrol$dfdgg10.a<=week.end[j]|is.na(my.casecontrol$dfdgg10.a),]

    original.cases <- casecontrol.select[casecontrol.select$death.a==1,]
    source('summary_scripts/stat_block_output_original.R')
    write.table(rbind(original.cases.stats), file = output.original, append = TRUE, col.names = FALSE, row.names = FALSE, sep = ",")
    rm(original.cases)

    mat<-cem(treatment = "death.a", data = casecontrol.select, drop = to.drop, cutpoints = my.cutpoints)
    cases.in.model<-casecontrol.select[mat$matched==1&casecontrol.select$death.a==1,]

    est<-att(mat, gg10.e ~ death.a, data = casecontrol.select)
    source('summary_scripts/stat_block_output.R')
    source('summary_scripts/stat_cem_output.R')
    write.table(rbind(cases.cem.stats), file = output.gg10, append = TRUE, col.names = FALSE, row.names = FALSE, sep = ",")

      rm(mat)
      rm(est)
      rm(casecontrol.select)
      rm(cases.in.model)
        }

        for (j in 3:6) {
    casecontrol.select <- my.casecontrol[my.casecontrol$dfdgp10.a>=week.begin[j]&my.casecontrol$dfdgp10.a<=week.end[j]|is.na(my.casecontrol$dfdgp10.a),]

    original.cases <- casecontrol.select[casecontrol.select$death.a==1,]
    source('summary_scripts/stat_block_output_original.R')
    write.table(rbind(original.cases.stats), file = output.original, append = TRUE, col.names = FALSE, row.names = FALSE, sep = ",")
    rm(original.cases)

    mat<-cem(treatment = "death.a", data = casecontrol.select, drop = to.drop, cutpoints = my.cutpoints)
    cases.in.model<-casecontrol.select[mat$matched==1&casecontrol.select$death.a==1,]

    est<-att(mat, gp10.e ~ death.a, data = casecontrol.select)
    source('summary_scripts/stat_block_output.R')
    source('summary_scripts/stat_cem_output.R')
    write.table(rbind(cases.cem.stats), file = output.gp10, append = TRUE, col.names = FALSE, row.names = FALSE, sep = ",")

      rm(mat)
      rm(est)
      rm(casecontrol.select)
      rm(cases.in.model)
        }

        for (j in 1:3) {
    casecontrol.select <- my.casecontrol[my.casecontrol$dfdss9.a>=week.begin[j]&my.casecontrol$dfdss9.a<=week.end[j]|is.na(my.casecontrol$dfdss9.a),]

    original.cases <- casecontrol.select[casecontrol.select$death.a==1,]
    source('summary_scripts/stat_block_output_original.R')
    write.table(rbind(original.cases.stats), file = output.original, append = TRUE, col.names = FALSE, row.names = FALSE, sep = ",")
    rm(original.cases)

    mat<-cem(treatment = "death.a", data = casecontrol.select, drop = to.drop, cutpoints = my.cutpoints)
    cases.in.model<-casecontrol.select[mat$matched==1&casecontrol.select$death.a==1,]

    est<-att(mat, ss9.e ~ death.a + death.a, data = casecontrol.select)
    source('summary_scripts/stat_block_output.R')
    source('summary_scripts/stat_cem_output.R')
    write.table(rbind(cases.cem.stats), file = output.ss9, append = TRUE, col.names = FALSE, row.names = FALSE, sep = ",")

      rm(mat)
      rm(est)
      rm(casecontrol.select)
      rm(cases.in.model)
      }
    
    rm(my.casecontrol)
     
    # end option (i) loop
    }
    


